import matplotlib.pyplot as plt
import json
import zipfile
import pandas as pd
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[4]") \
.appName('complaints_analysis') \
.config("spark.driver.memory", "15g") \
.getOrCreate()
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/06 21:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
data = None
with zipfile.ZipFile("complaints.json.zip", "r") as z:
for filename in z.namelist():
print(filename)
with z.open(filename) as f:
data = f.read()
complaints_df = spark.read.json(filename, multiLine=True) #json.loads(data.decode("utf-8"))
# complaints_df = pd.DataFrame(complaints)
# complaints_df.head()
complaints.json
print(f'The number of complaints in dataset is {complaints_df.count()}')
print(f'The number of features in dataset is {len(complaints_df.columns)}')
[Stage 1:> (0 + 1) / 1]
The number of complaints in dataset is 3127776 The number of features in dataset is 18
complaints_df.show()
[Stage 4:> (0 + 1) / 1]
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ | company|company_public_response| company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|date_received|date_sent_to_company| issue| product|state| sub_issue| sub_product|submitted_via| tags|timely|zip_code| +--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ |Experian Informat...| Company has respo...|Closed with non-m...| 6227407| The unverified ac...| Consent provided| N/A| 2022-11-22| 2022-11-22|Incorrect informa...|Credit reporting,...| FL|Information belon...|Credit reporting| Web| | Yes| 32839| | EQUIFAX, INC.| | In progress| 6254497| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| CA|Investigation too...|Credit reporting| Web| | Yes| 90045| |Valley Collection...| Company believes ...|Closed with expla...| 6250329| | Other| N/A| 2022-11-29| 2022-11-29|Attempts to colle...| Debt collection| AZ| Debt is not yours| Medical debt| Web| | Yes| 85234| | EQUIFAX, INC.| | In progress| 6254320| | | N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| GA|Information belon...|Credit reporting| Web| | Yes| 30097| | CITIBANK, N.A.| Company has respo...|Closed with expla...| 6251994| | Consent not provided| N/A| 2022-11-28| 2022-11-28|Communication tac...| Debt collection| GA|Frequent or repea...|Credit card debt| Web| | Yes| 30315| |Viking Client Ser...| |Closed with expla...| 6249435| | Other| N/A| 2022-11-25| 2022-11-25|Threatened to con...| Debt collection| FL|Talked to a third...| I do not know| Web| | Yes| 33025| |Experian Informat...| Company has respo...|Closed with expla...| 6240644| | Consent not provided| N/A| 2022-11-25| 2022-11-25|Improper use of y...|Credit reporting,...| AL|Reporting company...|Credit reporting| Web| | Yes| 35802| |Experian Informat...| Company has respo...|Closed with expla...| 6240781| | Other| N/A| 2022-11-25| 2022-11-25|Incorrect informa...|Credit reporting,...| MS|Information belon...|Credit reporting| Web| | Yes| 39735| |Experian Informat...| | In progress| 6241534| | | N/A| 2022-11-24| 2022-11-24|Improper use of y...|Credit reporting,...| TX|Reporting company...|Credit reporting| Web| | Yes| 75056| |HYUNDAI CAPITAL A...| Company has respo...|Closed with expla...| 6241749| | Consent not provided| N/A| 2022-11-24| 2022-11-24|Problems at the e...|Vehicle loan or l...| FL|Unable to receive...| Loan| Web|Servicemember| Yes| 33130| |NCB Management Se...| |Closed with expla...| 6128313| | | N/A| 2022-10-25| 2022-10-25|Attempts to colle...| Debt collection| LA| Debt is not yours| I do not know| Web| | Yes| 70817| |JPMORGAN CHASE & CO.| |Closed with expla...| 6206946| | | N/A| 2022-11-15| 2022-11-15|Problem with a le...|Checking or savin...| NY|Transaction was n...|Checking account| Web| | Yes| 10027| | EQUIFAX, INC.| | In progress| 6257631| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| OH|Their investigati...|Credit reporting| Web| | Yes| 45251| | EQUIFAX, INC.| | In progress| 6256727| | Other| N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| IL|Information belon...|Credit reporting| Web| | Yes| 60429| |FirstPoint Collec...| |Closed with expla...| 6205781| | | N/A| 2022-11-16| 2022-11-16|Attempts to colle...| Debt collection| NC| Debt is not yours| Medical debt| Web| | No| 27406| | EQUIFAX, INC.| | In progress| 6254379| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| PA|Their investigati...|Credit reporting| Web| | Yes| 19462| | EQUIFAX, INC.| | In progress| 6254402| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| TX|Investigation too...|Credit reporting| Web| | Yes| 75801| | EQUIFAX, INC.| | In progress| 6254475| | | N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| FL|Information belon...|Credit reporting| Web| | Yes| 32818| | EQUIFAX, INC.| | In progress| 6254486| | | N/A| 2022-11-29| 2022-11-29|Improper use of y...|Credit reporting,...| MI|Reporting company...|Credit reporting| Web| | Yes| 48125| |HCFS Health Care ...| | In progress| 6242110| | Other| N/A| 2022-11-24| 2022-11-24|Attempts to colle...| Debt collection| OH| Debt is not yours| Medical debt| Web|Servicemember| Yes| 43123| +--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ only showing top 20 rows
complaints_df.printSchema()
root |-- company: string (nullable = true) |-- company_public_response: string (nullable = true) |-- company_response: string (nullable = true) |-- complaint_id: string (nullable = true) |-- complaint_what_happened: string (nullable = true) |-- consumer_consent_provided: string (nullable = true) |-- consumer_disputed: string (nullable = true) |-- date_received: string (nullable = true) |-- date_sent_to_company: string (nullable = true) |-- issue: string (nullable = true) |-- product: string (nullable = true) |-- state: string (nullable = true) |-- sub_issue: string (nullable = true) |-- sub_product: string (nullable = true) |-- submitted_via: string (nullable = true) |-- tags: string (nullable = true) |-- timely: string (nullable = true) |-- zip_code: string (nullable = true)
Dataset has two date columns but they are in string format. I will convert them to datetime data type
from pyspark.sql.functions import col, to_date, min, max, countDistinct, count, desc, year
complaints_df = complaints_df.withColumn("date_received", to_date("date_received"))
complaints_df = complaints_df.withColumn("date_sent_to_company", to_date("date_sent_to_company"))
complaints_df.show()
[Stage 5:> (0 + 1) / 1]
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ | company|company_public_response| company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|date_received|date_sent_to_company| issue| product|state| sub_issue| sub_product|submitted_via| tags|timely|zip_code| +--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ |Experian Informat...| Company has respo...|Closed with non-m...| 6227407| The unverified ac...| Consent provided| N/A| 2022-11-22| 2022-11-22|Incorrect informa...|Credit reporting,...| FL|Information belon...|Credit reporting| Web| | Yes| 32839| | EQUIFAX, INC.| | In progress| 6254497| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| CA|Investigation too...|Credit reporting| Web| | Yes| 90045| |Valley Collection...| Company believes ...|Closed with expla...| 6250329| | Other| N/A| 2022-11-29| 2022-11-29|Attempts to colle...| Debt collection| AZ| Debt is not yours| Medical debt| Web| | Yes| 85234| | EQUIFAX, INC.| | In progress| 6254320| | | N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| GA|Information belon...|Credit reporting| Web| | Yes| 30097| | CITIBANK, N.A.| Company has respo...|Closed with expla...| 6251994| | Consent not provided| N/A| 2022-11-28| 2022-11-28|Communication tac...| Debt collection| GA|Frequent or repea...|Credit card debt| Web| | Yes| 30315| |Viking Client Ser...| |Closed with expla...| 6249435| | Other| N/A| 2022-11-25| 2022-11-25|Threatened to con...| Debt collection| FL|Talked to a third...| I do not know| Web| | Yes| 33025| |Experian Informat...| Company has respo...|Closed with expla...| 6240644| | Consent not provided| N/A| 2022-11-25| 2022-11-25|Improper use of y...|Credit reporting,...| AL|Reporting company...|Credit reporting| Web| | Yes| 35802| |Experian Informat...| Company has respo...|Closed with expla...| 6240781| | Other| N/A| 2022-11-25| 2022-11-25|Incorrect informa...|Credit reporting,...| MS|Information belon...|Credit reporting| Web| | Yes| 39735| |Experian Informat...| | In progress| 6241534| | | N/A| 2022-11-24| 2022-11-24|Improper use of y...|Credit reporting,...| TX|Reporting company...|Credit reporting| Web| | Yes| 75056| |HYUNDAI CAPITAL A...| Company has respo...|Closed with expla...| 6241749| | Consent not provided| N/A| 2022-11-24| 2022-11-24|Problems at the e...|Vehicle loan or l...| FL|Unable to receive...| Loan| Web|Servicemember| Yes| 33130| |NCB Management Se...| |Closed with expla...| 6128313| | | N/A| 2022-10-25| 2022-10-25|Attempts to colle...| Debt collection| LA| Debt is not yours| I do not know| Web| | Yes| 70817| |JPMORGAN CHASE & CO.| |Closed with expla...| 6206946| | | N/A| 2022-11-15| 2022-11-15|Problem with a le...|Checking or savin...| NY|Transaction was n...|Checking account| Web| | Yes| 10027| | EQUIFAX, INC.| | In progress| 6257631| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| OH|Their investigati...|Credit reporting| Web| | Yes| 45251| | EQUIFAX, INC.| | In progress| 6256727| | Other| N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| IL|Information belon...|Credit reporting| Web| | Yes| 60429| |FirstPoint Collec...| |Closed with expla...| 6205781| | | N/A| 2022-11-16| 2022-11-16|Attempts to colle...| Debt collection| NC| Debt is not yours| Medical debt| Web| | No| 27406| | EQUIFAX, INC.| | In progress| 6254379| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| PA|Their investigati...|Credit reporting| Web| | Yes| 19462| | EQUIFAX, INC.| | In progress| 6254402| | | N/A| 2022-11-29| 2022-11-29|Problem with a cr...|Credit reporting,...| TX|Investigation too...|Credit reporting| Web| | Yes| 75801| | EQUIFAX, INC.| | In progress| 6254475| | | N/A| 2022-11-29| 2022-11-29|Incorrect informa...|Credit reporting,...| FL|Information belon...|Credit reporting| Web| | Yes| 32818| | EQUIFAX, INC.| | In progress| 6254486| | | N/A| 2022-11-29| 2022-11-29|Improper use of y...|Credit reporting,...| MI|Reporting company...|Credit reporting| Web| | Yes| 48125| |HCFS Health Care ...| | In progress| 6242110| | Other| N/A| 2022-11-24| 2022-11-24|Attempts to colle...| Debt collection| OH| Debt is not yours| Medical debt| Web|Servicemember| Yes| 43123| +--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+ only showing top 20 rows
complaints_df.printSchema()
root |-- company: string (nullable = true) |-- company_public_response: string (nullable = true) |-- company_response: string (nullable = true) |-- complaint_id: string (nullable = true) |-- complaint_what_happened: string (nullable = true) |-- consumer_consent_provided: string (nullable = true) |-- consumer_disputed: string (nullable = true) |-- date_received: date (nullable = true) |-- date_sent_to_company: date (nullable = true) |-- issue: string (nullable = true) |-- product: string (nullable = true) |-- state: string (nullable = true) |-- sub_issue: string (nullable = true) |-- sub_product: string (nullable = true) |-- submitted_via: string (nullable = true) |-- tags: string (nullable = true) |-- timely: string (nullable = true) |-- zip_code: string (nullable = true)
min_date, max_date = complaints_df.select(min("date_received"), max("date_received")).first()
print(f'The range of the complaints received date is {min_date.strftime("%m/%d/%Y")} and {max_date.strftime("%m/%d/%Y")}')
[Stage 6:> (0 + 1) / 1]
The range of the complaints received date is 12/01/2011 and 12/05/2022
print(f'The total number of companies in the dataset is {complaints_df.select(countDistinct("company")).collect()[0][0]}')
[Stage 9:> (0 + 1) / 1]
The total number of companies in the dataset is 6581
ComplaintsCountPerCompany = complaints_df.groupBy("company")\
.agg(count("*").alias("ComplaintsCount"))\
.sort(desc("ComplaintsCount"))
ComplaintsCountPerCompany.show()
[Stage 15:> (0 + 1) / 1]
+--------------------+---------------+ | company|ComplaintsCount| +--------------------+---------------+ | EQUIFAX, INC.| 538662| |TRANSUNION INTERM...| 443688| |Experian Informat...| 415222| |BANK OF AMERICA, ...| 120194| |WELLS FARGO & COM...| 101653| |JPMORGAN CHASE & CO.| 96503| | CITIBANK, N.A.| 79735| |CAPITAL ONE FINAN...| 74032| | SYNCHRONY FINANCIAL| 40366| |Navient Solutions...| 37508| |Ocwen Financial C...| 33247| | U.S. BANCORP| 30257| | NATIONSTAR MORTGAGE| 26881| |AMERICAN EXPRESS ...| 25578| |PORTFOLIO RECOVER...| 22299| | DISCOVER BANK| 21496| | PNC Bank N.A.| 21391| |ENCORE CAPITAL GR...| 19989| |TD BANK US HOLDIN...| 19627| |Bread Financial H...| 19279| +--------------------+---------------+ only showing top 20 rows
pl = ComplaintsCountPerCompany.toPandas().head(10).plot(kind="bar",
x="company", y="ComplaintsCount",
figsize=(10, 7), alpha=0.5, color="olive")
pl.set_xlabel("Company")
pl.set_ylabel("Number of complaints")
pl.set_title("Number of complaints by Country")
plt.show()
ComplaintsCountPerYear = complaints_df.groupBy(year("date_received").alias("year"))\
.agg(count("*").alias("ComplaintsCount"))\
.sort("year")
ComplaintsCountPerYear.show()
[Stage 26:> (0 + 1) / 1]
+----+---------------+ |year|ComplaintsCount| +----+---------------+ |2011| 2536| |2012| 72373| |2013| 108217| |2014| 153034| |2015| 168466| |2016| 191457| |2017| 242931| |2018| 257252| |2019| 277324| |2020| 444329| |2021| 496019| |2022| 713838| +----+---------------+
pl = ComplaintsCountPerYear.toPandas().head(10).plot(kind="bar",
x="year", y="ComplaintsCount",
figsize=(10, 7), alpha=0.5, color="olive")
pl.set_xlabel("Company")
pl.set_ylabel("Number of complaints")
pl.set_title("Number of complaints by Country")
plt.show()
ComplaintsCountPerProduct = complaints_df.groupBy("product")\
.agg(count("*").alias("ComplaintsCount"))\
.sort(desc("ComplaintsCount"))
ComplaintsCountPerProduct.show()
[Stage 37:> (0 + 1) / 1]
+--------------------+---------------+ | product|ComplaintsCount| +--------------------+---------------+ |Credit reporting,...| 1452854| | Debt collection| 454053| | Mortgage| 365651| |Credit card or pr...| 166567| |Checking or savin...| 142819| | Credit reporting| 140429| | Credit card| 89190| |Bank account or s...| 86206| | Student loan| 71721| |Money transfer, v...| 48122| |Vehicle loan or l...| 37409| | Consumer Loan| 31596| |Payday loan, titl...| 25367| | Payday loan| 5543| | Money transfers| 5354| | Prepaid card| 3819| |Other financial s...| 1058| | Virtual currency| 18| +--------------------+---------------+
pl = ComplaintsCountPerProduct.toPandas().plot(kind="bar",
x="product", y="ComplaintsCount",
figsize=(10, 7), log=True, alpha=0.5, color="olive")
pl.set_xlabel("Product")
pl.set_ylabel("Number of complaints (Log scale)")
pl.set_title("Number of complaints by product")
plt.show()
print(f'The total number of products in the dataset is {complaints_df.select(countDistinct("product")).collect()[0][0]}')
[Stage 48:> (0 + 1) / 1]
The total number of products in the dataset is 18
complaints_df.select(countDistinct("company_public_response")).collect()[0][0]
12
company_public_response_count = complaints_df.groupBy("company_public_response")\
.agg(count("*").alias("ComplaintsCount"))\
.sort(desc("ComplaintsCount"))
company_public_response_count.show(truncate=False)
[Stage 176:> (0 + 1) / 1]
+-----------------------------------------------------------------------------------------------------------------------+---------------+ |company_public_response |ComplaintsCount| +-----------------------------------------------------------------------------------------------------------------------+---------------+ | |1766304 | |Company has responded to the consumer and the CFPB and chooses not to provide a public response |1153102 | |Company believes it acted appropriately as authorized by contract or law |112514 | |Company chooses not to provide a public response |52473 | |Company believes the complaint is the result of a misunderstanding |11023 | |Company disputes the facts presented in the complaint |8979 | |Company believes complaint caused principally by actions of third party outside the control or direction of the company|6973 | |Company believes complaint is the result of an isolated error |5650 | |Company believes complaint represents an opportunity for improvement to better serve consumers |4242 | |Company can't verify or dispute the facts in the complaint |3860 | |Company believes the complaint provided an opportunity to answer consumer's questions |2557 | |Company believes complaint relates to a discontinued policy or procedure |99 | +-----------------------------------------------------------------------------------------------------------------------+---------------+
company_response_count = complaints_df.groupBy("company_response")\
.agg(count("*").alias("count"))\
.sort(desc("count"))
company_response_count.show(truncate=False)
[Stage 185:> (0 + 1) / 1]
+-------------------------------+-------+ |company_response |count | +-------------------------------+-------+ |Closed with explanation |2417132| |Closed with non-monetary relief|466957 | |Closed with monetary relief |123524 | |In progress |70021 | |Closed without relief |17868 | |Closed |17611 | |Untimely response |9355 | |Closed with relief |5304 | | |4 | +-------------------------------+-------+
complaints_df.select("submitted_via").distinct().show()
[Stage 209:> (0 + 1) / 1]
+-------------+ |submitted_via| +-------------+ | Phone| | Fax| | Email| | Web Referral| | Referral| | Postal mail| | Web| +-------------+
complaints_df.select('issue').distinct().show()
[Stage 212:> (0 + 1) / 1]
+--------------------+ | issue| +--------------------+ |Communication tac...| |Application proce...| |Unexpected or oth...| |Advertising and m...| |Problem with an o...| |Getting a loan or...| |Balance transfer fee| |Customer service/...| | Adding money| |Credit card prote...| |Closing/Cancellin...| |Received a loan I...| |Vehicle was repos...| |Can't stop charge...| | Fees| | Bankruptcy| |Forbearance / Wor...| |Credit determination| |Confusing or miss...| |Loan modification...| +--------------------+ only showing top 20 rows
complaints_df.select('state').distinct().show()
[Stage 215:> (0 + 1) / 1]
+--------------------+ | state| +--------------------+ |UNITED STATES MIN...| | AZ| | SC| | LA| | MN| | AA| | NJ| | DC| | OR| | VA| | RI| | KY| | WY| | NH| | MI| | NV| | WI| | ID| | CA| | CT| +--------------------+ only showing top 20 rows
print(f'The total number of state in the dataset is {complaints_df.select(countDistinct("state")).collect()[0][0]}')
[Stage 224:> (0 + 1) / 1]
The total number of state in the dataset is 64
statewise_complaints_count = complaints_df.groupBy("state")\
.agg(count("*").alias("count"))\
.sort(desc("count"))
statewise_complaints_count.show(truncate=False)
[Stage 54:> (0 + 1) / 1]
+-----+------+ |state|count | +-----+------+ |CA |381389| |FL |350313| |TX |305289| |NY |204477| |GA |197188| |PA |141286| |IL |119176| |NJ |107395| |NC |98699 | |OH |84040 | |MD |83916 | |VA |79859 | |MI |69414 | |TN |61064 | |AL |57789 | |AZ |57321 | |SC |54923 | |LA |49813 | |MA |49303 | |WA |44082 | +-----+------+ only showing top 20 rows
import plotly.express as px
fig = px.choropleth(statewise_complaints_count.toPandas(),
locations='state',
locationmode="USA-states",
scope="usa",
color='count',
color_continuous_scale="Viridis_r",
)
fig.update_layout(
title_text = 'Number of complaints by State (2011-2020)',
title_font_family="Times New Roman",
title_font_size = 22,
title_font_color="black",
title_x=0.45,
)
fig.show()
statewise_complaints_per_day_count = complaints_df.groupBy(["state", year("date_received").alias('year')])\
.agg(count("*").alias("count"))\
.sort("year")
statewise_complaints_per_day_count.show(truncate=False)
[Stage 65:> (0 + 1) / 1]
+-----+----+-----+ |state|year|count| +-----+----+-----+ |SC |2011|21 | |LA |2011|12 | |NE |2011|4 | |IA |2011|13 | |GA |2011|99 | |DC |2011|11 | |WA |2011|64 | |WI |2011|24 | |KS |2011|15 | |ND |2011|1 | |HI |2011|13 | |DE |2011|15 | |AK |2011|5 | |AR |2011|9 | |MA |2011|71 | |WV |2011|7 | |RI |2011|10 | |SD |2011|10 | |OH |2011|69 | |TN |2011|37 | +-----+----+-----+ only showing top 20 rows
df = statewise_complaints_per_day_count.toPandas()
df.head()
| state | year | count | |
|---|---|---|---|
| 0 | MT | 2011 | 7 |
| 1 | IA | 2011 | 13 |
| 2 | TN | 2011 | 37 |
| 3 | DC | 2011 | 11 |
| 4 | NV | 2011 | 18 |
df['year'] = df['year'].astype(str)
df.head()
| state | year | count | |
|---|---|---|---|
| 0 | MT | 2011 | 7 |
| 1 | IA | 2011 | 13 |
| 2 | TN | 2011 | 37 |
| 3 | DC | 2011 | 11 |
| 4 | NV | 2011 | 18 |
fig = px.choropleth(df,
locations='state',
locationmode="USA-states",
color='count',
color_continuous_scale="Viridis_r",
scope="usa",
animation_frame='year')
fig.update_layout(
title_text = 'Number of complaints by State (2011-2020)',
title_font_family="Times New Roman",
title_font_size = 22,
title_font_color="black",
title_x=0.45,
)
fig.show()
spark.stop()